We take an exploratory approach to understand our omics-data better. In the “Data decription” section the creation of the data is explained while the “Data analysis” sections performs several statistical tests, which include Shapiro-Wilks test for normality, T-test and Mann-Whitney U test for significance, and exploratory plots, which include box plots, volcano plots, SVD plots, and MDS plots. Lastly we explore the data with K-means clustering.
library(tidyverse)
library(rstatix)
library(factoextra)
library(ChAMP)
library(RColorBrewer)
library(limma)
library(ComplexHeatmap)
library(ggrepel)
pal <- brewer.pal(8,"Dark2")
metabolites <- read_csv("results/metabolite_preprocessed.csv")
proteins <- read_csv("results/protein_preprocessed.csv") %>%
dplyr::rename("patient_id" = "Patient_ID")
miRNA <- read_csv("results/miRNA_preprocessed.csv")
metadata <- read_csv("results/metadata_preprocessed.csv")
metadata_proteins <- read_csv("data/metadata_OlinkID.csv") %>%
pivot_longer(cols = c(-1), names_to = "proteins") %>%
pivot_wider(names_from = c(1))
metadata_metabolites <- read_csv("data/metadata_metabolomics.csv")
apply_shapiro <- function(data, variables, significance) {
# 1. Running the Shapiro Wilk test
results <- data %>%
shapiro_test(variables)
# 2. Filtering for the significant p-values
results %>%
filter(p < significance) %>%
nrow()
}
apply_ttest <- function(data, grp1, grp2) {
# 1. Calculating p-values
data %>%
select(-1) %>%
apply(2, function(x) {
samp1 <- x[grp1]
samp2 <- x[grp2]
test_result <- t.test(samp1, samp2)
return(test_result$p.value)
})
}
apply_mwu <- function(data, grp1, grp2) {
# 1. Calculating p-values
mwu_p <- data %>%
select(-1) %>%
apply(2, function(x) {
samp1 <- x[grp1]
samp2 <- x[grp2]
test_result <- wilcox.test(samp1, samp2)
return(test_result$p.value)})
# 2. Calculating fold change
fold_change <- data %>%
select(-1) %>%
apply(2, function(x){
samp1 <- x[grp1]
samp2 <- x[grp2]
# test_result <- mean(log2(x[seq_along(grp1)])) - mean(log2(x[(length(grp1) + 1):ncol(data)]))
test_result <- mean(samp1) - mean(samp2)
return(test_result)})
# 3. Combining to table
variables <- data %>%
select(-1) %>%
names()
mwu_table <- tibble(
variable = variables,
p.value = mwu_p,
adjusted_p.value = p.adjust(mwu_p, method = "BH"),
fold_change = fold_change)}
plot_volcano <- function(data, significance) {
data %>%
ggplot(
aes(
x = fold_change,
y = -log10(p.value),
color = p.value < significance
)) +
geom_point() +
labs(
x = "Fold change (log2)",
y = "P-value (-log10)",
color = str_glue("p < {significance}")
)
}
plot_densities <- function(data, title) {
data %>%
select(2:ncol(data)) %>%
pivot_longer(
everything(),
names_to = "variable",
values_to = "value"
) %>%
ggplot(mapping = aes(
x = value,
y = after_stat(scaled),
color = variable
)) +
geom_density() +
guides(color = "none") +
labs(
x = "Value",
y = "Density",
title = title
)}
The miRNA data is analysed for 36 samples, 33 patients and 3 healthy donors.
Method: NGS
Preprocessing: The readings are mapped to known human miRNA from the miRBase data base. !(The data output comes out in the form of counts - no modification). The facility also provides mature miRNA log transformed values (that we use here). Constant values of this dataset were removed.
Checking sample’s outliers
# Plotting the samples/individuals
# pdf(file = "results/plots/miRNA_box_plot.pdf", wi = 9, he = 6)
miRNA %>%
select(-patient_id) %>%
t() %>%
boxplot(
col = palette()[-1],
main = "miRNA distribtution", xlab = "Samples", ylab = "miRNA concentration ")
miRNA expression data often does not follow a normal distribution, even after log transformation. The number of miRNA that do not satisfy normality at p = 0.05 is: 1420
# Applying Shapiro Wilks test for normality
variables <- miRNA %>%
select(-patient_id) %>%
names()
apply_shapiro(miRNA, variables, 0.05)
## [1] 2873
The number of miRNA that were significantly different between PSC samples and controls at p = 0.05 is: 506
# Calculating the p-values
p_values <- miRNA %>%
apply_ttest(grp1 = c(1:33), grp2 = c(34:36))
hist(p_values)
# Calculating the amount of significant proteins
sum(p_values < 0.05)
## [1] 1107
# pdf(file = "results/plots/miRNA_p_histogram.pdf", wi = 9, he = 6)
mwu <- apply_mwu(miRNA, grp1 = c(1:33), grp2 = c(34:36))
hist(mwu)
datatable(mwu %>%
mutate(fold_change = abs(fold_change)),
filter = "top",
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv"))) %>%
formatRound(columns = 2:4, digits=6)
The number of significantly differentially expressed miRNA at p < 0.05 is 341, of which 312 are upregulated and 29 are downregulated.
# Filter for significant miRNAs (customize the conditions if needed)
# Add a new column for miRNAs to label based on the thresholds
mwu$label <- ifelse((mwu$p.value < 0.05 & (mwu$fold_change < -2.5 | mwu$fold_change > 8.5) | mwu$p.value < 0.00029 & mwu$fold_change > 6.5), mwu$variable, NA)
# pdf(file = "results/plots/volcano_miRNA.pdf")
ggplot(mwu, aes(x = fold_change, y = -log10(p.value), color = ifelse(p.value < 0.05, "Significant", "Non-significant"))) +
geom_point(alpha = 0.8, size = 1.5) +
geom_label_repel(aes(label = label),
size = 3,
box.padding = 0.3,
max.overlaps = 10,
color = "black",
show.legend = FALSE) +
scale_color_manual(values = c("Non-significant" = "royalblue", "Significant" = "firebrick")) +
theme_minimal(base_size = 14) +
theme(panel.background = element_rect(fill = "lightgray", color = NA)) +
labs(x = "Fold Change", y = "-Log10(p-value)", color = "p < 0.05") +
ggtitle("Volcano Plot of Mann-Whitney test of miRNAs")
We check the plausible confounders and the studied effects in the data
beta1 <- miRNA %>%
dplyr::slice(1:33) %>%
dplyr::select(-patient_id) %>%
t() %>%
as.data.frame()
pd1 <- metadata %>%
dplyr::slice(1:33) %>%
select(cca_binary, ibd_binary, crohn_or_uc, fibrosis, fibrosis_binary, alp, alp_binary, bilirubin, bilirubin_binary, group, sex, age, age_cat, bmi, bmi_cat, crp, alat, IgG, IgA, auto_anb, overlap_aih, steroids_other_medic, jaundice, pruritis, fever_chill, fatigue, no_symptoms) %>%
as.data.frame()
pdf(file = "results/plots/svd_miRNA.pdf")
champ.SVD(beta = beta1, pd = pd1)
# while (!is.null(dev.list())) dev.off()
Using MDS plots to check the grouping of the data
# pdf(file = "results/plots/MDS_miRNA.pdf", wi = 9, he = 6)
miRNA %>%
dplyr::select(-patient_id) %>%
dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(1, 2),
col=pal[factor(metadata$group[1:33])])
miRNA %>%
dplyr::select(-patient_id) %>%
dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(1, 3),
col=pal[factor(metadata$group[1:33])])
miRNA %>%
dplyr::select(-patient_id) %>%
dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(2, 3),
col=pal[factor(metadata$group[1:33])])
The protein data is analysed for 36 samples, 33 patients and 3 healthy donors.
Method: Olink’s proximity extension assay (PEA) is used in which each protein are bound by two antibodies with tail DNA oligonucleotides. Upon binding the oligonucleotides hybridize and are extended by DNA polymerase forming a barcode that can be read by qPCR or NGS. The PEA was done with Olink Target 96, 7 panels of proteins with 92 proteins each, total of 644 proteins. The panels are “CELL REGULATION”, “DEVELOPMENT”, “IMMUNE RESPONSE”, “INFLAMMATION”, “METABOLISM”, “ONCOLOGY II”, “ONCOLOGY III”, “ORGAN DAMAGE”.
Preprocessing: Only one sample did not pass the quality control, sample ID 30 for “ONCOLOGY III”. Several proteins did not pass quality control and were removed (failed in 8% or more samples, see proteins_preprocessing.R). The data comes in the unit of normalized protein expression (NPX) values on a log2 scale. They are normalized to interplate controls.
Links Olink Target 96: https://olink.com/products/olink-target-96 Olink preprocessing for qPCR: https://7074596.fs1.hubspotusercontent-na1.net/hubfs/7074596/05-white%20paper%20for%20website/1096-olink-data-normalization-white-paper.pdf Panel proteins: https://www.bioxpedia.com/olink-proteomics/#:~:text=The%20readout%20for%20most%20Olink,equals%20a%20high%20protein%20concentration.
Preliminary Checks:
Visualize your log-transformed data (e.g., histograms, density plots) to ensure the transformation effectively reduces skewness. If outliers are present, consider their biological relevance. The Mann-Whitney test is robust to outliers but may still be influenced by their ranks.
# Plotting the samples/individuals
proteins %>%
select(-patient_id) %>%
t() %>%
limma::plotDensities()
#
# Plotting the features/variables
# proteins %>%
# select(-patient_id) %>%
# limma::plotDensities()
Checking sample and data for outliers
# Plotting the samples/individuals
# pdf(file = "results/plots/proteins_barplot.pdf", wi = 9, he = 6)
proteins %>%
select(-patient_id) %>%
t() %>%
boxplot(col = palette()[-1],
main = "Protein distribtution", xlab = "Samples", ylab = "Protein concentration")
# Plotting the features/variables
# proteins %>%
# select(-patient_id) %>%
# boxplot(col = palette()[-1],
# main = "Protein distribtution", ylab = "Protein concentration ")
# Plotting the features/variables
proteins %>%
select(-patient_id) %>%
limma::plotMDS()
The number of proteins that do not satisfy normality at p = 0.05 is:
# Applying Shapirot_Wilks test for normality
variables <- proteins %>%
select(-patient_id) %>%
names()
apply_shapiro(proteins, variables, 0.05)
## [1] 232
The number of proteins that were significantly different between PSC samples and controls at p = 0.05 is: 257
p_values <- proteins %>%
apply_ttest(grp1 = c(1:33), grp2 = c(34:36))
# pdf(file = "results/plots/proteins_histogram.pdf", wi = 9, he = 6)
hist(p_values)
# Calculating the amount of significant proteins
sum(p_values < 0.05)
## [1] 45
# pdf(file = "results/plots/proteins_p_histogram.pdf", wi = 9, he = 6)
mwu <- apply_mwu(proteins, grp1 = c(1:33), grp2 = c(34:36))
hist(mwu)
datatable(mwu %>%
inner_join(metadata_proteins %>%
dplyr::select(proteins, Assay),
by = c("variable" = "proteins")) %>%
dplyr::select(-variable) %>%
relocate(Assay),
filter = "top",
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv"))) %>%
formatRound(columns = 2:4, digits=6)
The number of significantly differentially expressed proteins at p < 0.05 is 22, of which 19 are upregulated and 3 are downregulated.
# pdf(file = "results/plots/proteins_volcano.pdf", wi = 9, he = 6)
# Add a new column for proteins to label based on the thresholds
mwu <- mwu %>%
inner_join(metadata_proteins %>%
dplyr::select(proteins, Assay),
by = c("variable" = "proteins"))
mwu$label <- ifelse((mwu$p.value < 0.05 & (mwu$fold_change < -1 | mwu$fold_change > 1) | mwu$p.value < 0.01 ), mwu$Assay, NA)
ggplot(mwu, aes(x = fold_change, y = -log10(p.value), color = ifelse(p.value < 0.05, "Significant", "Non-significant"))) +
geom_point(alpha = 0.8, size = 1.5) +
geom_label_repel(aes(label = label),
size = 3,
box.padding = 0.3,
max.overlaps = 10,
color = "black",
show.legend = FALSE) +
scale_color_manual(values = c("Non-significant" = "royalblue", "Significant" = "firebrick")) +
theme_minimal(base_size = 14) +
theme(panel.background = element_rect(fill = "lightgray", color = NA)) +
labs(x = "Fold Change", y = "-Log10(p-value)", color = "p < 0.05") +
ggtitle("Volcano Plot of Mann-Whitney test of proteins")
We check the plausible confounders and the studied effects in the data
beta1 <- proteins %>%
dplyr::slice(1:33) %>% ### no Control
dplyr::select(-patient_id) %>%
t() %>%
as.data.frame()
pd1 <- metadata %>%
dplyr::slice(1:33) %>%
select(cca_binary, ibd_binary, crohn_or_uc, fibrosis, fibrosis_binary, alp, alp_binary, bilirubin, bilirubin_binary, group, sex, age, age_cat, bmi, bmi_cat, crp, alat, IgG, IgA, auto_anb, overlap_aih, steroids_other_medic, jaundice, pruritis, fever_chill, fatigue, no_symptoms) %>%
as.data.frame()
# pdf(file = "results/plots/svd_proteins.pdf")
champ.SVD(beta = beta1, pd = pd1)
# while (!is.null(dev.list())) dev.off()
Using MDS plots to check the grouping of the data according to the confouders found by SVD: colors represent age categories: No much effect of age categories
# pdf(file = "results/plots/MDS_by_age_proteins.pdf")
proteins %>%
dplyr::select(-patient_id) %>%
# dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(1, 2),
col=pal[factor(metadata$alp_binary[1:33])])
proteins %>%
dplyr::select(-patient_id) %>%
# dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(1, 3),
col=pal[factor(metadata$alp_binary[1:33])])
proteins %>%
dplyr::select(-patient_id) %>%
# dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(2, 3),
col=pal[factor(metadata$alp_binary[1:33])])
Looking at the groups (early PSC, CCA, IBD, Advanced and progressing): there is some grouping but not good enough: some feature selection may be needed.
# pdf(file = "results/plots/MDS_proteins.pdf")
proteins %>%
dplyr::select(-patient_id) %>%
dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(1, 2),
col=pal[factor(metadata$group[1:33])])
proteins %>%
dplyr::select(-patient_id) %>%
dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(1, 3),
col=pal[factor(metadata$group[1:33])])
proteins %>%
dplyr::select(-patient_id) %>%
dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(2, 3),
col=pal[factor(metadata$group[1:33])])
The metabolite data is analysed for 45 samples, 33 patients and 12 healthy donors.
Method: Ultra performance liquid chromatography + Tandem mass spectroscopy.
Preprocessing: The values are log transformed (base unknown). The missing values are imputated with the minimum observed value of that respective compound. The data is normalized so that the median/medians are 1.0000 and all other values are proportional.
log2_metabolites <- cbind(metabolites[1],
log2(metabolites[2:length(metabolites)]))
# Plotting the samples/individuals
log2_metabolites %>%
select(-patient_id) %>%
t() %>%
limma::plotDensities()
#
# # Plotting the features/variables
# proteins %>%
# select(-patient_id) %>%
# limma::plotDensities()
Checking sample’s outliers
# pdf(file = "results/plots/barplots_metabolites.pdf")
log2_metabolites %>%
select(-patient_id) %>%
t() %>%
boxplot(
col = palette()[-1],
main = "Metabolite distribtution", xlab = "Samples", ylab = "Metabolite concentration ")
# Plotting the features/variables
# log2_metabolites %>%
# select(-patient_id) %>%
# boxplot(
# col = palette()[-1],
# main = "Metabolite distribtution", ylab = "Metabolite concentration ")
The number of metabolites that do not satisfy normality at p = 0.05 is: 524
# Applying Shapirot_Wilks test for normality
variables <- log2_metabolites %>%
select(-patient_id) %>%
names()
apply_shapiro(log2_metabolites, variables, 0.05)
## [1] 524
The number of metabolites that were significantly different between PSC samples and controls at p = 0.05 is: 391
# Calculating the p-values
p_values <- metabolites %>%
apply_ttest(grp1 = c(1:33), grp2 = c(34:45))
# Plotting the distribution
hist(p_values)
# Calculating the amount of significant proteins
sum(p_values < 0.05)
## [1] 391
# pdf(file = "results/plots/p_val_histogram_metabolites.pdf")
mwu <- apply_mwu(log2_metabolites, grp1 = c(1:33), grp2 = c(34:45))
hist(mwu)
mwu <- mwu %>%
full_join(metadata_metabolites %>%
select(BIOCHEMICAL, 'SUPER PATHWAY', 'COMP ID') %>%
mutate_at("COMP ID", as.character),
by = c("variable" = "COMP ID"))
datatable(mwu,
filter = "top",
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))
The number of significantly differentially expressed miRNA at p < 0.01 is 276, of which 95 are upregulated and 223 are downregulated.
mwu$label <- ifelse((mwu$p.value < 0.01 & (mwu$fold_change < -3 | mwu$fold_change > 5.2) | mwu$p.value < 0.0000000001 ), mwu$`SUPER PATHWAY`, NA)
# pdf(file = "results/plots/volcano_metabolites.pdf")
ggplot(mwu, aes(x = fold_change, y = -log10(p.value), color = ifelse(p.value < 0.05, "Significant", "Non-significant"))) +
geom_point(alpha = 0.8, size = 1.5) +
geom_label_repel(aes(label = label),
size = 3,
box.padding = 0.3,
max.overlaps = 10,
color = "black",
show.legend = FALSE) +
scale_color_manual(values = c("Non-significant" = "royalblue", "Significant" = "firebrick")) +
theme_minimal(base_size = 14) +
theme(panel.background = element_rect(fill = "lightgray", color = NA)) +
labs(x = "Fold Change", y = "-Log10(p-value)", color = "p < 0.01") +
ggtitle("Volcano Plot of Mann-Whitney test of metabolites")
We check the plausible confounders and the studied effects in the data
beta1 <- log2_metabolites %>%
dplyr::slice(1:36) %>%
dplyr::select(-patient_id) %>%
t() %>%
as.data.frame()
pd1 <- metadata %>%
# dplyr::slice(1:33) %>%
select(cca_binary, ibd_binary, crohn_or_uc, fibrosis, fibrosis_binary, alp, alp_binary, bilirubin, bilirubin_binary, group, sex, age, age_cat, bmi, bmi_cat, crp, alat, IgG, IgA, auto_anb, overlap_aih, steroids_other_medic, jaundice, pruritis, fever_chill, fatigue, no_symptoms) %>%
as.data.frame()
pdf(file = "results/plots/svd_metabolomics.pdf")
champ.SVD(beta = beta1, pd = pd1)
# while (!is.null(dev.list())) dev.off()
Using MDS plots to check the grouping of the data according to the confouders found by SVD. Age effect is clearly visible in the data. It may be necessary to adjust for it or to do some feature selection.
# pdf(file = "results/plots/metabolites_MDS_by_age.pdf")
log2_metabolites %>%
dplyr::select(-patient_id) %>%
# dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(1, 2),
col=pal[factor(metadata$bilirubin_binary[1:33])])
log2_metabolites %>%
dplyr::select(-patient_id) %>%
# dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(1, 3),
col=pal[factor(metadata$bilirubin_binary[1:33])])
log2_metabolites %>%
dplyr::select(-patient_id) %>%
# dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(2, 3),
col=pal[factor(metadata$bilirubin_binary[1:33])])
Looking at the groups (early PSC, CCA, IBD, Advanced and progressing): there is some grouping but not good enough: some feature selection may be needed.
# pdf(file = "results/plots/metabolites_MDS.pdf")
log2_metabolites %>%
dplyr::select(-patient_id) %>%
dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(1, 2),
col=pal[factor(metadata$group[1:33])])
log2_metabolites %>%
dplyr::select(-patient_id) %>%
dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(1, 3),
col=pal[factor(metadata$group[1:33])])
log2_metabolites %>%
dplyr::select(-patient_id) %>%
dplyr::slice(1:33) %>%
t() %>%
plotMDS(
top=1000,
gene.selection="common",
dim.plot = c(2, 3),
col=pal[factor(metadata$group[1:33])])
We scaled (z-score) the data before clustering.
Why Scaling Matters for K-Means: K-means calculates distances between data points using all features. If features are on different scales (e.g., miRNAs range from 0-10, proteins range from 100-10,000), features with larger magnitudes dominate the distance calculation, biasing the clustering results.
res_km_r <- eclust(scale(miRNA[, 2:ncol(miRNA)]), "kmeans")
fviz_gap_stat(res_km_r$gap_stat)
# pdf(file = "results/Illustrations/transcriptomics_kmean_k4.pdf")
res_km_r <- eclust(scale(miRNA %>% dplyr::select(-patient_id)), "kmeans", k = 3)
res_km_r <- eclust(scale(miRNA %>% dplyr::select(-patient_id)), FUNcluster = "hclust", k = 4, graph = TRUE)
fviz_cluster(res_km_r, scale(miRNA %>% dplyr::select(-patient_id)), ellipse.type = "ellipse", palette = c("#0b5313", "#ec4dd8", "blue", "orange"), ggtheme = theme_minimal())
res_km_r <- eclust(scale(proteins %>% dplyr::select(-patient_id)), "kmeans")
# pdf(file = "results/plots/proteomics_kmean_k3.pdf")
fviz_gap_stat(res_km_r$gap_stat)
res_km_r <- eclust(scale(proteins %>% dplyr::select(-patient_id)), "kmeans", k = 3)
res_km_r <- eclust(scale(proteins %>% dplyr::select(-patient_id)), FUNcluster = "hclust", k = 3, graph = TRUE)
fviz_cluster(res_km_r, scale(proteins %>% dplyr::select(-patient_id)), ellipse.type = "ellipse", palette = c("#0b5313", "#ec4dd8", "blue", "orange"), ggtheme = theme_minimal())
Scaled, Best k = 4
res_km_r <- eclust(scale(log2_metabolites %>% dplyr::select(-patient_id)), "kmeans")
# pdf(file = "results/plots/metabolites_kmean_k3.pdf")
fviz_gap_stat(res_km_r$gap_stat)
res_km_r <- eclust(scale(log2_metabolites %>% dplyr::select(-patient_id)), "kmeans", k = 3)
res_km_r <- eclust(scale(log2_metabolites), FUNcluster = "hclust", k = 3, graph = TRUE)
fviz_cluster(res_km_r, scale(log2_metabolites %>% dplyr::select(-patient_id)), ellipse.type = "ellipse", palette = c("#0b5313", "#ec4dd8", "blue", "orange"), ggtheme = theme_minimal())
Non scaled
res_km_r <- eclust(log2_metabolites %>% dplyr::select(-patient_id), "kmeans")
fviz_gap_stat(res_km_r$gap_stat)
# pdf(file = "results/Paper_illustratios/kmean_lasso_grpcolor.pdf")
res_km_r <- eclust(log2_metabolites%>% dplyr::select(-patient_id), "kmeans", k = 4)
# while (!is.null(dev.list())) dev.off()
res_km_r <- eclust(log2_metabolites%>% dplyr::select(-patient_id), FUNcluster = "hclust", k = 4, graph = TRUE)
fviz_cluster(res_km_r, log2_metabolites%>% dplyr::select(-patient_id), ellipse.type = "ellipse", palette = c("#0b5313", "#ec4dd8", "blue", "orange"), ggtheme = theme_minimal())